readStructure <- function(path) {
    k <- as.numeric(sub(".+-(.*)", "\\1", str_remove(basename(path), "\\..*") ))
    kpops <- paste0("k", 1:k)
    read_lines(path) |>
        str_remove("^\\s+") |>
        str_remove("\\s+$") |>
        str_remove(": ") |>
        str_replace_all("\\s+", "\t") |>
        as_tibble() |>
        separate(col = value, into = c("i1", "i2", "i3", "i4", kpops), sep = "\t") |>
        select(i1, starts_with("k")) |>
        pivot_longer(names_to = "k-value", values_to = "proportion", starts_with("k")) |>
        mutate(
            proportion = as.numeric(proportion),
            i1 = as.numeric(i1)
        )
}

# Function that takes the PCA list we'll generate below and plot the desired PCs
# for the specified population
plotPCA <- function(pca_list, species, pcA, pcB, xstep = 1, ystep = 1, pal = "Paired") {
    sp_pca <- pca_list |>
        pluck(species)
    
    # x-axis: max/min
    xmin <- sp_pca |>
        pull(pcA) |>
        min(na.rm = TRUE) |>
        floor()
    
    xmax <- sp_pca |>
        pull(pcA) |>
        max(na.rm = TRUE) |>
        ceiling()
    
    # y-axis: max/min
    ymin <- sp_pca |>
        pull(pcB) |>
        min(na.rm = TRUE) |>
        floor()
    
    ymax <- sp_pca |>
        pull(pcB) |>
        max(na.rm = TRUE) |>
        ceiling()

    sp_pca |>
        ggplot(aes(x = .data[[pcA]], y = .data[[pcB]], colour = population)) +
        geom_point(size = 2) +
        scale_color_brewer(palette = pal) +
        labs(title = glue::glue("{pcA} vs {pcB}")) +
        scale_y_continuous(
            limits = c(ymin, ymax),
            breaks = seq(ymin, ymax, by = ystep),
            labels = as.character(seq(ymin, ymax, by = ystep)),
            expand = c(0, 0)
        ) +
        scale_x_continuous(
            limits = c(xmin, xmax),
            breaks = seq(xmin, xmax, by = xstep),
            labels = as.character(seq(xmin, xmax, by = xstep)),
            expand = c(0, 0)
        ) +
        theme(
            legend.position = "bottom",
            legend.title = element_text(hjust = 0.5, face = "bold", size = 14),
            legend.title.position = "top",
            plot.title = element_text(hjust = 0.5),
            title = element_text(face = "bold")
        )
}

PCA

Here we load the PCA CSV file generated by the Ipyrad.analysis toolkit.

lst_pca <- fs::dir_ls(here("results", "population-structure", "pca"), glob = "*.csv") |>
    (\(x) set_names(x, str_remove(basename(x), "-.*")))() |>
    imap(\(x, i) {
        tmp <- x |>
            read_csv(col_names = TRUE, col_types = cols())
        colnames(tmp) <- c("sample", paste0("PC", 1:(ncol(tmp) - 1)))
        pops |>
            filter(species == i) |>
            left_join(tmp)
    })

Aipysurus laevis

pc_1_2 <- plotPCA(pca_list = lst_pca, species = "ALA", pcA = "PC1", pcB = "PC2", xstep = 2)
pc_1_3 <- plotPCA(pca_list = lst_pca, species = "ALA", pcA = "PC1", pcB = "PC3", xstep = 2)
pc_2_3 <- plotPCA(pca_list = lst_pca, species = "ALA", pcA = "PC2", pcB = "PC3")


design <- "12\n34"

pc_1_2 + pc_1_3 + pc_2_3 + guide_area() +
    plot_layout(
        design = design,
        guides = "collect",
        axes = "collect"
    ) &
    guides(colour = guide_legend(ncol = 3, override.aes = list(size = 3)))

Hydrophis major

pc_1_2 <- plotPCA(pca_list = lst_pca, species = "HMA", pcA = "PC1", pcB = "PC2", xstep = 2, ystep = 2)
pc_1_3 <- plotPCA(pca_list = lst_pca, species = "HMA", pcA = "PC1", pcB = "PC3", xstep = 2, ystep = 2)
pc_2_3 <- plotPCA(pca_list = lst_pca, species = "HMA", pcA = "PC2", pcB = "PC3", ystep = 2)

pc_1_2 + pc_1_3 + pc_2_3 + guide_area() +
    plot_layout(
        design = design, 
        guides = "collect",
        axes = "collect"
    ) &
    guides(colour = guide_legend(ncol = 3, override.aes = list(size = 4)))

Hydrophis stokesii

pc_1_2 <- plotPCA(pca_list = lst_pca, species = "HST", pcA = "PC1", pcB = "PC2", xstep = 2, ystep = 2)
pc_1_3 <- plotPCA(pca_list = lst_pca, species = "HST", pcA = "PC1", pcB = "PC3", xstep = 2, ystep = 2)
pc_2_3 <- plotPCA(pca_list = lst_pca, species = "HST", pcA = "PC2", pcB = "PC3", xstep = 2, ystep = 2)

pc_1_2 + pc_1_3 + pc_2_3 + guide_area() +
    plot_layout(
        design = design, 
        guides = "collect",
        axes = "collect"
    ) &
    guides(colour = guide_legend(ncol = 3, override.aes = list(size = 4)))

STRUCTURE

Below we plot the STRUCTURE results generated for all sample in each species. If we wanted to generate a plot with a subset of the samples (e.g. a specific couple of populations), we’d need to re-run STRUCTURE with just those samples.

First we load in the STRUCTURE output files using our custom R function (see above).

pops_structure <- pops |> mutate(i1 = row_number(), .by = species)

structure_df <- fs::dir_ls(
    here("results", "population-structure"), 
    glob = "*clumpp.outfile", 
    recurse = TRUE
) |>
    (\(x) set_names(x, str_remove(basename(x), "\\..*")))() |>
    map(readStructure) |>
    list_rbind(names_to = "species") |>
    separate_wider_delim(delim = "-", names = c("species", "K"), too_many = "merge", cols = species) |>
    left_join(pops_structure)

Choosing the correct K (N-ancestral populations)

Choosing the best K value can be determined by plotting the estimated log-probability for each K value, in addition to the delta-K value. The values can be interpreted as follows:

  • Estimated log-probability: Higher values equal better model fit.
  • Delta-K: The rate of change between successive K-values. Larger values indicate reater change in the model fit
etable <- fs::dir_ls(here("results"), glob = "*etable.csv", recurse = TRUE) |>
    read_csv(col_names = TRUE, col_types = cols(), id = "species") |>
    mutate(species = str_remove(basename(species), "-etable.csv")) |>
    rename(K = `...1`)
prb <- etable |>
    ggplot(aes(x = K, y = estLnProbMean, colour = species)) +
    geom_point() +
    geom_line() +
    scale_y_continuous(
        name = "Estimated mean log-probability",
        breaks = seq(0, -105000, -10000),
        limits = c(-105e3, -4e4),
        labels = scales::label_number(style_negative = "minus")
    ) +
    scale_x_continuous(breaks = seq(2, 9, 1))

dk <- etable |>
    ggplot(aes(x = K, y = deltaK, colour = species)) +
    geom_point() +
    geom_line() +
    scale_x_continuous(breaks = seq(2, 9, 1)) +
    scale_y_continuous(
        name = "Delta-K",
        breaks = seq(0, 8, 1)
    )

prb/dk +
    plot_layout(axes = "collect", guides = "collect")

For the three species, with the current data, we can determine the following:

  • Aipysurus: The two highest log-probability values are at k = 5 and K = 7. The Delta-K for both of these values is quite high, indicating that both K-values improved the model fit substantially.
  • Hydrophis major: K = 5 appears to be the best choice for H. major. The estimated log-probability is relatively stable for most values of K, before dipping once reaching higher values. Delta-K peaks at K = 5, with little improvement after.
  • Hydrophis stokesii: The estimated log-probabilities for stokesii show a similar pattern to H. major, being consistent across multiple K values. Delta-K spikes at K = 4, indicating that this is likely the best value for the current dataset.

Aipysurus laevis

structure_df |>
    filter(species == "ALA", K == "K-5") |>
    ggplot(aes(x = sample, y = proportion, fill = `k-value`)) +
    geom_col(position = "stack") +
    scale_fill_brewer(palette = "Set2") +
    scale_y_continuous(expand = c(0, 0)) +
    labs(x = NULL) + 
    facet_grid(
        cols = vars(population), scales = "free_x", space = "free_x"
        
    ) +
    guides(fill = guide_legend(title.position = "bottom")) +
    theme(
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.98),
        strip.text = element_text(angle = 90, vjust = 0.5, hjust = 0.1),
        strip.background = element_blank(),
        legend.position = "bottom",
        legend.title = element_text(hjust = 0.5),
        panel.spacing.x = unit(0.2, "line")
    )

Hydrophis major

structure_df |>
    filter(species == "HMA", K == "K-5") |>
    ggplot(aes(x = sample, y = proportion, fill = `k-value`)) +
    geom_col(position = "stack") +
    scale_fill_brewer(palette = "Set2") +
    scale_y_continuous(expand = c(0, 0)) +
    labs(x = NULL) + 
    facet_grid(
        cols = vars(population), scales = "free_x", space = "free_x"
        
    ) +
    guides(fill = guide_legend(title.position = "bottom")) +
    theme(
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.98),
        strip.text = element_text(angle = 90, vjust = 0.5, hjust = 0.1),
        strip.background = element_blank(),
        legend.position = "bottom",
        legend.title = element_text(hjust = 0.5),
        panel.spacing.x = unit(0.2, "line")
    )

Hydrophis stokesii

structure_df |>
    filter(species == "HST", K == "K-4") |>
    ggplot(aes(x = sample, y = proportion, fill = `k-value`)) +
    geom_col(position = "stack") +
    scale_fill_brewer(palette = "Set2") +
    scale_y_continuous(expand = c(0, 0)) +
    labs(x = NULL) +
    facet_grid(
        cols = vars(population), scales = "free_x", space = "free_x"
        
    ) +
    guides(fill = guide_legend(title.position = "bottom")) +
    theme(
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.98),
        strip.text = element_text(angle = 90, vjust = 0.5, hjust = 0.1),
        strip.background = element_blank(),
        legend.position = "bottom",
        legend.title = element_text(hjust = 0.5),
        panel.spacing.x = unit(0.2, "line")
    )